#!/usr/bin/python
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import os

# Index page on gov.cn that lists links to every yearly government work report.
url = "http://www.gov.cn/guoqing/2006-02/16/content_2616810.htm"

# Local directory where the downloaded report text files are saved.
# (Dropped the stray trailing semicolon — not idiomatic Python.)
savePath = "/home/xiaoyu/Documents/DataVisualizationByD3/D3/publicdata/data/Goverment/ROTW"


def getUrlList():
    """Fetch the index page and return the anchor tags for each report.

    Returns:
        A list of bs4 Tag objects — the <a> elements inside the centered
        index table whose href contains "content_" (one per yearly report).
    """
    # Use the response as a context manager so the HTTP connection is
    # closed deterministically (the original leaked it until GC).
    with urlopen(url) as html:
        bsObj = BeautifulSoup(html, "html.parser")

    # Crawl by following the hrefs: report pages are identified by the
    # "content_" segment in their URL.
    urlList = bsObj.find("table", {"align": "center"}).findAll(
        "a", href=re.compile("^(.+)(content_)(.+)$"))
    return urlList

def getReport(year, u):
    """Download one yearly report page and save its paragraph text to disk.

    Args:
        year: Year label (string) used to build the output file name.
        u: Absolute URL of the report page.

    Returns:
        u + ":ok" on success, u + ":wrong" on any failure. Deliberately
        best-effort: errors are printed, not raised, so the caller's
        loop over all years keeps going.
    """
    try:
        # Context managers close the HTTP response and the output file
        # even when an exception interrupts the body (the original
        # leaked both on mid-write failures).
        with urlopen(u) as html:
            bsObj = BeautifulSoup(html, "html.parser")
        paragraphs = bsObj.findAll("p")

        # encoding="utf-8" makes writing the Chinese text portable;
        # without it the platform default encoding could fail.
        with open(savePath + "/" + year + "年政府工作报告", "w+",
                  encoding="utf-8") as file:
            for p in paragraphs:
                file.write(p.get_text() + "\n")
        return u + ":ok"
    except Exception as err:
        print(err)
        return u + ":wrong"



if __name__ == '__main__':

    # makedirs creates missing intermediate directories too, and
    # exist_ok avoids the racy exists()-then-mkdir check ("== False"
    # was also a truthiness anti-idiom).
    os.makedirs(savePath, exist_ok=True)
    urlList = getUrlList()

    # Each link's visible text is the year; its href is the report URL.
    for i, link in enumerate(urlList):
        print(str(i) + ":" + getReport(link.get_text(), link.attrs['href']))